Contents

%run set_theme.ipynb
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode

init_notebook_mode()
full = pd.read_parquet('../data/SO_2014_2022.pq')

# Number of respondents for every (Gender, Education) combination.
df = full.groupby('Gender')['Education'].value_counts()
res = pd.DataFrame()

# Collapse the detailed education categories into four levels, ordered from
# the highest level to the lowest so that the cumulative sum below reads as
# "reached at least this level".
res['master/doctorate'] = df.loc[:, 'doctor'] + df.loc[:, 'professional'] + df.loc[:, 'master']
res['bachelor/tertiary'] = df.loc[:, 'bachelor'] + df.loc[:, 'tertiary'] + df.loc[:, 'assoc']
res['secondary'] = df.loc[:, 'secondary']
res['primary'] = df.loc[:, 'primary'] + df.loc[:, 'none']
res = res.cumsum(axis=1)

# After the cumulative sum, 'primary' holds each gender's total, so dividing
# every row by it converts the counts to percentages of that gender.
# A true division returns a new float frame; writing float percentages into
# the integer count rows in place is what raised pandas' "incompatible dtype"
# FutureWarning (and will raise an error in a future pandas version).
res = res.div(res['primary'], axis=0) * 100
res
C:\Users\Efe\AppData\Local\Temp\ipykernel_19056\1555502919.py:9: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '27.144707442554093' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.

C:\Users\Efe\AppData\Local\Temp\ipykernel_19056\1555502919.py:9: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '89.21028324328788' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.

C:\Users\Efe\AppData\Local\Temp\ipykernel_19056\1555502919.py:9: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '98.91687436465058' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
master/doctorate bachelor/tertiary secondary primary
Gender
female 28.899066 91.328082 99.219637 100
male 27.144707 89.210283 98.916874 100
# Solid colours are used for the node bars, the translucent variants for the
# links between them.
blue_alpha = 'rgba(91,111,236,0.3)'
pink_alpha = 'rgba(248,84,238,0.3)'

blue = '#5b6fec'
pink = '#f854ee'

# Fully transparent colour for the links that only feed the first bars.
transparent = 'rgba(255,255,255,0)'

# One row per link: (source node, target node, value, hover text, colour).
# Keeping all attributes of a link in a single row guarantees that the five
# arrays handed to plotly have the same length; previously the hover texts
# stopped after twelve of the sixteen links.
links = [
    # Male respondents.
    (1, 2, res['secondary']['male'],
     "Almost all male respondents<br>completed secondary education",
     blue_alpha),
    (1, 0, 0, "", blue_alpha),
    (2, 3, res['bachelor/tertiary']['male'],
     "Most male respondents continued to tertiary education,<br>though slightly less than female respondents",
     blue_alpha),
    (2, 0, 0, "", blue_alpha),
    (3, 4, res['master/doctorate']['male'],
     "Over a quarter of male respondents continued to a master or doctorate,<br>though slightly less than female respondents",
     blue_alpha),
    (3, 0, 0, "", blue_alpha),
    (4, 0, 0, "", blue_alpha),
    # Female respondents.
    (5, 6, res['secondary']['female'],
     "Almost all female respondents<br>completed secondary education",
     pink_alpha),
    (5, 0, 0, "", pink_alpha),
    (6, 7, res['bachelor/tertiary']['female'],
     "Most female respondents continued to tertiary education,<br>though slightly more than male respondents",
     pink_alpha),
    (6, 0, 0, "", pink_alpha),
    (7, 8, res['master/doctorate']['female'],
     "Over a quarter of female respondents continued to a master or doctorate,<br>though slightly more than male respondents",
     pink_alpha),
    (7, 0, 0, "", pink_alpha),
    (8, 0, 0, "", pink_alpha),
    # Invisible links that give the two "all respondents" bars a value of 100.
    (9, 1, 100, "", transparent),
    (9, 5, 100, "", transparent),
]
link_sources, link_targets, link_values, link_hover, link_colors = (
    list(column) for column in zip(*links)
)

sankey_plot = go.Sankey(
    arrangement='snap',
    node={
        'pad': 15,
        'thickness': 20,
        # NOTE(review): the links reference node ids 0-9 while each node
        # array below has nine entries, and the last x value (-1) lies
        # outside the 0-1 range. The positions were presumably tuned against
        # the rendered figure -- confirm visually before changing them.
        'x': [0.05, 0.33, 0.62, 0.9,
              0.05, 0.33, 0.62, 0.9, -1],
        'y': [0.3, 0.3, 0.735, 0.5865,
              0.2, 0.24, 0.2, 0.05, 0.1],
        'customdata': [
            "",
            "All valid answers from male<br>respondents were used",
            "Male respondents, secondary degree",
            "Male respondents, tertiary education",
            "Male respondents, master / doctorate",
            "All valid answers from female<br>respondents were used",
            "Female respondents, secondary degree",
            "Female respondents, tertiary education",
            "Female respondents, master / doctorate"
        ],
        # Show only the custom text; the empty <extra> tag removes the
        # secondary hover box.
        'hovertemplate': '%{customdata}<extra></extra>',
        'color': ['green', blue, blue, blue, blue, pink, pink, pink, pink]
    },
    link={
        'source': link_sources,
        'target': link_targets,
        'value': link_values,
        'customdata': link_hover,
        'hovertemplate': '%{customdata}<extra></extra>',
        'arrowlen': 14,
        'color': link_colors,
    }
)

# Wrap the Sankey trace in a figure.
fig = go.Figure(data=[sankey_plot])

# Title with a smaller subtitle on a second line.
headline = 'Educational Progression By Gender'
tagline = 'Approximately equal, slightly higher for female respondents'

fig.update_layout(
    title_text=f'{headline}<br><sup>{tagline}</sup>',
    # Extra room at the bottom for the labels and the caption placed below
    # the plot area.
    margin=dict(b=130, t=100),
    width=790,
)

# White text and border for the hover boxes.
fig.update_traces(hoverlabel=dict(font_color='white', bordercolor='white'))

font_size = 15
bottom_labels_y = -0.1

# Labels below the bars: (x, text).
stage_labels = [
    (-0.02, "All respondents"),
    (0.28, "Secondary"),
    (0.62, "Bachelor/Tertiary"),
    (0.955, "Master/PhD"),
]
for x_pos, label in stage_labels:
    fig.add_annotation(x=x_pos, y=bottom_labels_y, text=label, showarrow=False)

# Percentage labels: (x, y, column of `res`, gender row of `res`).
# The female labels sit in the top half, the male labels in the bottom half.
percentage_labels = [
    (0.14, 0.965, 'secondary', 'female'),
    (0.47, 0.965, 'bachelor/tertiary', 'female'),
    (0.79, 0.97, 'master/doctorate', 'female'),
    (0.14, 0.415, 'secondary', 'male'),
    (0.47, 0.415, 'bachelor/tertiary', 'male'),
    (0.79, 0.415, 'master/doctorate', 'male'),
]
for x_pos, y_pos, level, gender in percentage_labels:
    fig.add_annotation(
        x=x_pos,
        y=y_pos,
        text=f"{res[level][gender]:.1f}%",
        font={'size': font_size},
        showarrow=False,
    )

# Labels on the left: (x, y, text).
gender_labels = [
    (-0.06, 0.80, "Female"),
    (-0.04, 0.20, "Male"),
]
for x_pos, y_pos, label in gender_labels:
    fig.add_annotation(x=x_pos, y=y_pos, text=label, showarrow=False)

# Caption with explanation, anchored at its bottom-left corner.
caption_lines = [
    'The brightly colored bars represent educational levels. Each percentage indicates those that graduated from the',
    'educational level on the right hand side of the percentage, coming from the previous educational level on the left',
    'hand side of the percentage.',
]
fig.add_annotation(
    x=-0.08,
    y=-0.36,
    showarrow=False,
    xanchor='left',
    yanchor='bottom',
    align='left',
    text='<br>'.join(caption_lines),
)

fig.show()